1 Data Acquisition

1.1 Challenge - Patent data

library(rvest)
library(stringi)
library(xopen)
library(jsonlite)
library(tidyverse)
library(glue)

url <- "https://www.radon-bikes.de"
#xopen(url)
html_home <- read_html(url)

#Get url for each of the product categories
bike_category_url_tbl <- html_home %>% html_nodes(".megamenu__item") %>% html_elements("a") %>% 
  html_attr('href') %>% enframe(name = 'position',value = 'title') %>% mutate(
    category_url=glue("https://www.radon-bikes.de{title}")
  ) %>% filter(position<9)

#Get the Model Names
get_bike_data <- function(link){
  model_infor <- read_html(link) %>% html_nodes(".o-slick-slider__slide-content") %>%
  html_element("h2") %>% html_text() %>% enframe(name = "position",value = 'model')
  
  #Get the sizes
  bike_sizes <- read_html(link) %>% html_nodes(".o-slick-slider__slide-content")%>%  
  html_node('.m-bikegrid__size') %>% html_text() %>% str_extract("[0-9].+")%>% 
 enframe(name = "position",value = "size")
  
#Get the prices
  bike_prices <- read_html(link) %>% html_nodes(".m-serienpanel__price--active") %>% html_text() %>% 
    str_remove("^ .$") %>%  stri_remove_empty() %>%  enframe(name='position',value = 'price')
  
#combine three using position
   model_infor %>% left_join(bike_sizes) %>% left_join(bike_prices) %>%
    filter(position!=23)
}

# Get the URl for 1st category
category_1 <-  bike_category_url_tbl$category_url[2] 
bike_model<- get_bike_data(link = category_1)

bike_category_url_data <- bike_category_url_tbl %>% pull(category_url) 

bike_data_list <- map(bike_category_url_data,get_bike_data) 

bike_data_tbl <- bind_rows(bike_data_list) 

#removing "Euro" symbol from price
bike_data_tbl$price<-gsub("€","",as.character(bike_data_tbl$price))%>% na.exclude()
bike_data_tbl  %>% head(n=10)# display 10 rows

1.2 Business case - Loan data

# Tidyverse
library(tidyverse)
library(vroom)

# Data Table
library(data.table)

# Counter
library(tictoc)

# 2.0 DATA IMPORT ----

# 2.1 Loan Acquisitions Data ----

col_types_acq <- list(
  loan_id                            = col_factor(),
  original_channel                   = col_factor(NULL),
  seller_name                        = col_factor(NULL),
  original_interest_rate             = col_double(),
  original_upb                       = col_integer(),
  original_loan_term                 = col_integer(),
  original_date                      = col_date("%m/%Y"),
  first_pay_date                     = col_date("%m/%Y"),
  original_ltv                       = col_double(),
  original_cltv                      = col_double(),
  number_of_borrowers                = col_double(),
  original_dti                       = col_double(),
  original_borrower_credit_score     = col_double(),
  first_time_home_buyer              = col_factor(NULL),
  loan_purpose                       = col_factor(NULL),
  property_type                      = col_factor(NULL),
  number_of_units                    = col_integer(),
  occupancy_status                   = col_factor(NULL),
  property_state                     = col_factor(NULL),
  zip                                = col_integer(),
  primary_mortgage_insurance_percent = col_double(),
  product_type                       = col_factor(NULL),
  original_coborrower_credit_score   = col_double(),
  mortgage_insurance_type            = col_double(),
  relocation_mortgage_indicator      = col_factor(NULL))

acquisition_data <- vroom(
  file       = "00_data/loan_data/Acquisition_2019Q1.txt", 
  delim      = "|", 
  col_names  = names(col_types_acq),
  col_types  = col_types_acq,
  na         = c("", "NA", "NULL"))
acquisition_data %>% glimpse()
## Rows: 297,452
## Columns: 25
## $ loan_id                            <fct> 100000913397, 100017539727, 1000180~
## $ original_channel                   <fct> C, B, R, C, B, C, R, R, R, B, R, C,~
## $ seller_name                        <fct> "JPMORGAN CHASE BANK, NATIONAL ASSO~
## $ original_interest_rate             <dbl> 5.875, 4.750, 4.875, 4.875, 4.250, ~
## $ original_upb                       <int> 324000, 307000, 256000, 248000, 490~
## $ original_loan_term                 <int> 360, 360, 360, 360, 360, 360, 360, ~
## $ original_date                      <date> 2018-09-01, 2018-12-01, 2018-11-01~
## $ first_pay_date                     <date> 2018-11-01, 2019-02-01, 2019-01-01~
## $ original_ltv                       <dbl> 80, 90, 90, 90, 67, 69, 95, 80, 97,~
## $ original_cltv                      <dbl> 80, 90, 90, 90, 67, 69, 95, 80, 97,~
## $ number_of_borrowers                <dbl> 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1,~
## $ original_dti                       <dbl> 49, 44, 41, 40, 35, 31, 45, 47, 37,~
## $ original_borrower_credit_score     <dbl> 692, 722, 728, 730, 727, 798, 710, ~
## $ first_time_home_buyer              <fct> N, N, N, Y, Y, N, N, N, Y, Y, N, N,~
## $ loan_purpose                       <fct> C, P, P, P, P, P, P, P, P, P, P, P,~
## $ property_type                      <fct> PU, PU, SF, SF, CO, PU, PU, CO, SF,~
## $ number_of_units                    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ occupancy_status                   <fct> P, P, S, P, P, P, P, P, P, P, P, P,~
## $ property_state                     <fct> CA, TX, NC, IL, CA, FL, TX, TX, IN,~
## $ zip                                <int> 925, 770, 286, 600, 945, 337, 773, ~
## $ primary_mortgage_insurance_percent <dbl> NA, 25, 25, 25, NA, NA, 30, NA, 35,~
## $ product_type                       <fct> FRM, FRM, FRM, FRM, FRM, FRM, FRM, ~
## $ original_coborrower_credit_score   <dbl> 665, NA, 738, 791, NA, 810, 778, NA~
## $ mortgage_insurance_type            <dbl> NA, 1, 1, 1, NA, NA, 1, NA, 1, 1, 1~
## $ relocation_mortgage_indicator      <fct> N, N, N, N, N, N, Y, N, N, N, N, N,~
# 2.2 Performance Data ----
col_types_perf = list(
  loan_id                                = col_factor(),
  monthly_reporting_period               = col_date("%m/%d/%Y"),
  servicer_name                          = col_factor(NULL),
  current_interest_rate                  = col_double(),
  current_upb                            = col_double(),
  loan_age                               = col_double(),
  remaining_months_to_legal_maturity     = col_double(),
  adj_remaining_months_to_maturity       = col_double(),
  maturity_date                          = col_date("%m/%Y"),
  msa                                    = col_double(),
  current_loan_delinquency_status        = col_double(),
  modification_flag                      = col_factor(NULL),
  zero_balance_code                      = col_factor(NULL),
  zero_balance_effective_date            = col_date("%m/%Y"),
  last_paid_installment_date             = col_date("%m/%d/%Y"),
  foreclosed_after                       = col_date("%m/%d/%Y"),
  disposition_date                       = col_date("%m/%d/%Y"),
  foreclosure_costs                      = col_double(),
  prop_preservation_and_repair_costs     = col_double(),
  asset_recovery_costs                   = col_double(),
  misc_holding_expenses                  = col_double(),
  holding_taxes                          = col_double(),
  net_sale_proceeds                      = col_double(),
  credit_enhancement_proceeds            = col_double(),
  repurchase_make_whole_proceeds         = col_double(),
  other_foreclosure_proceeds             = col_double(),
  non_interest_bearing_upb               = col_double(),
  principal_forgiveness_upb              = col_double(),
  repurchase_make_whole_proceeds_flag    = col_factor(NULL),
  foreclosure_principal_write_off_amount = col_double(),
  servicing_activity_indicator           = col_factor(NULL))

performance_data <- vroom(
  file       = "00_data/loan_data/Performance_2019Q1.txt", 
  delim      = "|", 
  col_names  = names(col_types_perf),
  col_types  = col_types_perf,
  na         = c("", "NA", "NULL"))

#performance_data %>% glimpse()

# 3.1 Acquisition Data ----
class(acquisition_data)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
setDT(acquisition_data)

class(acquisition_data)
## [1] "data.table" "data.frame"
#acquisition_data %>% glimpse()

# 3.2 Performance Data ----
setDT(performance_data)

#performance_data %>% glimpse()

# 4.0 DATA WRANGLING ----

# 4.1 Joining / Merging Data ----

tic()
combined_data <- merge(x = acquisition_data, y = performance_data, 
                       by    = "loan_id", 
                       all.x = TRUE, 
                       all.y = FALSE)
toc()
## 1.53 sec elapsed
combined_data %>% glimpse()
## Rows: 3,105,040
## Columns: 55
## $ loan_id                                <fct> 100000913397, 100000913397, 100~
## $ original_channel                       <fct> C, C, C, C, C, C, C, C, C, C, C~
## $ seller_name                            <fct> "JPMORGAN CHASE BANK, NATIONAL ~
## $ original_interest_rate                 <dbl> 5.875, 5.875, 5.875, 5.875, 5.8~
## $ original_upb                           <int> 324000, 324000, 324000, 324000,~
## $ original_loan_term                     <int> 360, 360, 360, 360, 360, 360, 3~
## $ original_date                          <date> 2018-09-01, 2018-09-01, 2018-0~
## $ first_pay_date                         <date> 2018-11-01, 2018-11-01, 2018-1~
## $ original_ltv                           <dbl> 80, 80, 80, 80, 80, 80, 80, 80,~
## $ original_cltv                          <dbl> 80, 80, 80, 80, 80, 80, 80, 80,~
## $ number_of_borrowers                    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ original_dti                           <dbl> 49, 49, 49, 49, 49, 49, 49, 49,~
## $ original_borrower_credit_score         <dbl> 692, 692, 692, 692, 692, 692, 6~
## $ first_time_home_buyer                  <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ loan_purpose                           <fct> C, C, C, C, C, C, C, C, C, C, C~
## $ property_type                          <fct> PU, PU, PU, PU, PU, PU, PU, PU,~
## $ number_of_units                        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ occupancy_status                       <fct> P, P, P, P, P, P, P, P, P, P, P~
## $ property_state                         <fct> CA, CA, CA, CA, CA, CA, CA, CA,~
## $ zip                                    <int> 925, 925, 925, 925, 925, 925, 9~
## $ primary_mortgage_insurance_percent     <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ product_type                           <fct> FRM, FRM, FRM, FRM, FRM, FRM, F~
## $ original_coborrower_credit_score       <dbl> 665, 665, 665, 665, 665, 665, 6~
## $ mortgage_insurance_type                <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ relocation_mortgage_indicator          <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ monthly_reporting_period               <date> 2019-01-01, 2019-02-01, 2019-0~
## $ servicer_name                          <fct> "OTHER", "", "", "", "", "", ""~
## $ current_interest_rate                  <dbl> 5.875, 5.875, 5.875, 5.875, 5.8~
## $ current_upb                            <dbl> NA, NA, NA, NA, NA, NA, 320968.~
## $ loan_age                               <dbl> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12~
## $ remaining_months_to_legal_maturity     <dbl> 357, 356, 355, 354, 353, 352, 3~
## $ adj_remaining_months_to_maturity       <dbl> 357, 356, 355, 354, 353, 352, 3~
## $ maturity_date                          <date> 2048-10-01, 2048-10-01, 2048-1~
## $ msa                                    <dbl> 40140, 40140, 40140, 40140, 401~
## $ current_loan_delinquency_status        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ modification_flag                      <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ zero_balance_code                      <fct> , , , , , , , , , , , , , , , ,~
## $ zero_balance_effective_date            <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ last_paid_installment_date             <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ foreclosed_after                       <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ disposition_date                       <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ foreclosure_costs                      <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ prop_preservation_and_repair_costs     <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ asset_recovery_costs                   <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ misc_holding_expenses                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ holding_taxes                          <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ net_sale_proceeds                      <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ credit_enhancement_proceeds            <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ repurchase_make_whole_proceeds         <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ other_foreclosure_proceeds             <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ non_interest_bearing_upb               <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ principal_forgiveness_upb              <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ repurchase_make_whole_proceeds_flag    <fct> , , , , , , , , , , , , , , , ,~
## $ foreclosure_principal_write_off_amount <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ servicing_activity_indicator           <fct> N, N, N, N, N, N, N, N, N, N, N~
# Same operation with dplyr
tic()
performance_data %>%
  left_join(acquisition_data, by = "loan_id")
toc()
## 3.35 sec elapsed
# Preparing the Data Table

setkey(combined_data, "loan_id")
key(combined_data)
## [1] "loan_id"
#?setorder()
setorderv(combined_data, c("loan_id", "monthly_reporting_period"))

# 4.3 Select Columns ----
combined_data %>% dim()
## [1] 3105040      55
keep_cols <- c("loan_id",
               "monthly_reporting_period",
               "seller_name",
               "current_interest_rate",
               "current_upb",
               "loan_age",
               "remaining_months_to_legal_maturity",
               "adj_remaining_months_to_maturity",
               "current_loan_delinquency_status",
               "modification_flag",
               "zero_balance_code",
               "foreclosure_costs",
               "prop_preservation_and_repair_costs",
               "asset_recovery_costs",
               "misc_holding_expenses",
               "holding_taxes",
               "net_sale_proceeds",
               "credit_enhancement_proceeds",
               "repurchase_make_whole_proceeds",
               "other_foreclosure_proceeds",
               "non_interest_bearing_upb",
               "principal_forgiveness_upb",
               "repurchase_make_whole_proceeds_flag",
               "foreclosure_principal_write_off_amount",
               "servicing_activity_indicator",
               "original_channel",
               "original_interest_rate",
               "original_upb",
               "original_loan_term",
               "original_ltv",
               "original_cltv",
               "number_of_borrowers",
               "original_dti",
               "original_borrower_credit_score",
               "first_time_home_buyer",
               "loan_purpose",
               "property_type",
               "number_of_units",
               "property_state",
               "occupancy_status",
               "primary_mortgage_insurance_percent",
               "product_type",
               "original_coborrower_credit_score",
               "mortgage_insurance_type",
               "relocation_mortgage_indicator")

combined_data <- combined_data[, ..keep_cols]

combined_data %>% dim()
## [1] 3105040      45
#combined_data %>% glimpse()

# 4.4 Grouped Mutations ----
# - Add response variable (Predict wether loan will become delinquent in next 3 months)

# dplyr
tic()
temp <- combined_data %>%
  group_by(loan_id) %>%
  mutate(gt_1mo_behind_in_3mo_dplyr = lead(current_loan_delinquency_status, n = 3) >= 1) %>%
  ungroup()  
toc()
## 12.17 sec elapsed
combined_data %>% dim()
## [1] 3105040      45
temp %>% dim()
## [1] 3105040      46
# data.table
tic()
combined_data[, gt_1mo_behind_in_3mo := lead(current_loan_delinquency_status, n = 3) >= 1,
              by = loan_id]
toc()
## 12 sec elapsed
combined_data %>% dim()
## [1] 3105040      46
# Remove the temp variable
rm(temp)

# 5.1 How many loans in a month ----
tic()
combined_data[!is.na(monthly_reporting_period), .N, by = monthly_reporting_period]
toc()
## 0.25 sec elapsed
tic()
combined_data %>%
  filter(!is.na(monthly_reporting_period)) %>%
  count(monthly_reporting_period) 
toc()
## 2.24 sec elapsed
# 5.2 Which loans have the most outstanding delinquencies ----
# data.table
tic()
combined_data[current_loan_delinquency_status >= 1, 
              list(loan_id, monthly_reporting_period, current_loan_delinquency_status, seller_name, current_upb)][
                , max(current_loan_delinquency_status), by = loan_id][
                  order(V1, decreasing = TRUE)]
toc()
## 0.29 sec elapsed
# 5.3 Get last unpaid balance value for delinquent loans ----
# data.table
tic()
combined_data[current_loan_delinquency_status >= 1, .SD[.N], by = loan_id][
  !is.na(current_upb)][
    order(-current_upb), .(loan_id, monthly_reporting_period, current_loan_delinquency_status, seller_name, current_upb)  
  ]
toc()
## 3.55 sec elapsed
# 5.4 Loan Companies with highest unpaid balance
# data.table
tic()
upb_by_company_dt <- combined_data[!is.na(current_upb), .SD[.N], by = loan_id][
  , .(sum_current_upb = sum(current_upb, na.rm = TRUE), cnt_current_upb = .N), by = seller_name][
    order(sum_current_upb, decreasing = TRUE)]
toc()
## 39.33 sec elapsed
upb_by_company_dt